Load Theme for plots
theme_set(theme_fivethirtyeight())
theme_update(axis.title = element_text()) #the default for fivethirtyeight is to not show axis labels, this removes that default so we can choose to specify and display axis titles
theme_update(plot.title = element_text(hjust = 0.5)) # changing default to center all titles
Load Data downloaded from UCI and stored on github repo https://archive.ics.uci.edu/ml/datasets/Adult
adult = read.csv("/Users/Camo/Documents/SMU_DS/Applied Stats/Project2/Predicting_Income/adult.data", header = FALSE)
Description of variables from UCI:
Response: >50K, <=50K.
age: continuous. workclass: Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, State-gov, Without-pay, Never-worked. fnlwgt: continuous. education: Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 5th-6th, Preschool. education-num: continuous. marital-status: Married-civ-spouse, Divorced, Never-married, Separated, Widowed, Married-spouse-absent, Married-AF-spouse. occupation: Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, Armed-Forces. relationship: Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. race: White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. sex: Female, Male. capital-gain: continuous. capital-loss: continuous. hours-per-week: continuous. native-country: United-States, Cambodia, England, Puerto-Rico, Canada, Germany, Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands.
Add column names to data set:
# NOTE: names using underscore instead of hyphen so they can be referenced easier later
colnames(adult) <- c("age","workclass","fnlwgt","education","education_num","marital_status","occupation","relationship","race","sex","capital_gain","capital_loss","hours_per_week","native_country","income")
Investigate NA values to determine what needs resolution
aggr_plot <- aggr(adult, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adult), cex.axis=.7, gap=3, ylab=c("Percent data missing","Combinations Missing"))
##
## Variables sorted by number of missings:
## Variable Count
## age 0
## workclass 0
## fnlwgt 0
## education 0
## education_num 0
## marital_status 0
## occupation 0
## relationship 0
## race 0
## sex 0
## capital_gain 0
## capital_loss 0
## hours_per_week 0
## native_country 0
## income 0
#Note there are not missing values showing up initially but that's because the missing values are represented by "?" instead of NA
#Replace "?" with NA and re-do missing value analysis
adult[, 1:14][adult[, 1:14] == " ?"] <- NA
aggr_plot <- aggr(adult, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(adult), cex.axis=.7, gap=3, ylab=c("Percent data missing","Combinations Missing"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## occupation 0.05660146
## workclass 0.05638647
## native_country 0.01790486
## age 0.00000000
## fnlwgt 0.00000000
## education 0.00000000
## education_num 0.00000000
## marital_status 0.00000000
## relationship 0.00000000
## race 0.00000000
## sex 0.00000000
## capital_gain 0.00000000
## capital_loss 0.00000000
## hours_per_week 0.00000000
## income 0.00000000
marginplot(adult[c(2,7)])
marginplot(adult[c(2,14)])
marginplot(adult[c(7,14)])
#occupation missing 5.66% of values
#workclass missing 5.64% of values
#native-country missing 1.79& of values
#Note that half of the missing workclass values occur on observations that are also missing occupation
Examine formats of data available
#Remove leading whitespace
adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")]=as.data.frame(apply(adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")],2,function(x)gsub('\\s+', '',x)))
#Convert character vars to factors and make list of vars
adult$workclass <- as.factor(adult$workclass)
adult$education <- as.factor(adult$education)
adult$marital_status <- as.factor(adult$marital_status)
adult$occupation <- as.factor(adult$occupation)
adult$relationship <- as.factor(adult$relationship)
adult$race <- as.factor(adult$race)
adult$sex <- as.factor(adult$sex)
adult$native_country <- as.factor(adult$native_country)
adult$income <- as.factor(adult$income)
categorical.explanatory = c("workclass","education","marital_status","occupation","relationship","race","sex","native_country")
str(adult)
## 'data.frame': 32561 obs. of 15 variables:
## $ age : int 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : Factor w/ 8 levels "Federal-gov",..: 7 6 4 4 4 4 4 6 4 4 ...
## $ fnlwgt : int 77516 83311 215646 234721 338409 284582 160187 209642 45781 159449 ...
## $ education : Factor w/ 16 levels "10th","11th",..: 10 10 12 2 10 13 7 12 13 10 ...
## $ education_num : int 13 13 9 7 13 14 5 9 14 13 ...
## $ marital_status: Factor w/ 7 levels "Divorced","Married-AF-spouse",..: 5 3 1 3 3 3 4 3 5 3 ...
## $ occupation : Factor w/ 14 levels "Adm-clerical",..: 1 4 6 6 10 4 8 4 10 4 ...
## $ relationship : Factor w/ 6 levels "Husband","Not-in-family",..: 2 1 2 1 6 6 2 1 2 1 ...
## $ race : Factor w/ 5 levels "Amer-Indian-Eskimo",..: 5 5 5 3 3 5 3 5 5 5 ...
## $ sex : Factor w/ 2 levels "Female","Male": 2 2 2 2 1 1 1 2 1 2 ...
## $ capital_gain : int 2174 0 0 0 0 0 0 0 14084 5178 ...
## $ capital_loss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hours_per_week: int 40 13 40 40 40 40 16 45 50 40 ...
## $ native_country: Factor w/ 41 levels "Cambodia","Canada",..: 39 39 39 39 5 39 23 39 39 39 ...
## $ income : Factor w/ 2 levels "<=50K",">50K": 1 1 1 1 1 1 1 2 2 2 ...
summary(adult)
## age workclass fnlwgt
## Min. :17.00 Private :22696 Min. : 12285
## 1st Qu.:28.00 Self-emp-not-inc: 2541 1st Qu.: 117827
## Median :37.00 Local-gov : 2093 Median : 178356
## Mean :38.58 State-gov : 1298 Mean : 189778
## 3rd Qu.:48.00 Self-emp-inc : 1116 3rd Qu.: 237051
## Max. :90.00 (Other) : 981 Max. :1484705
## NA's : 1836
## education education_num marital_status
## HS-grad :10501 Min. : 1.00 Divorced : 4443
## Some-college: 7291 1st Qu.: 9.00 Married-AF-spouse : 23
## Bachelors : 5355 Median :10.00 Married-civ-spouse :14976
## Masters : 1723 Mean :10.08 Married-spouse-absent: 418
## Assoc-voc : 1382 3rd Qu.:12.00 Never-married :10683
## 11th : 1175 Max. :16.00 Separated : 1025
## (Other) : 5134 Widowed : 993
## occupation relationship race
## Prof-specialty : 4140 Husband :13193 Amer-Indian-Eskimo: 311
## Craft-repair : 4099 Not-in-family : 8305 Asian-Pac-Islander: 1039
## Exec-managerial: 4066 Other-relative: 981 Black : 3124
## Adm-clerical : 3770 Own-child : 5068 Other : 271
## Sales : 3650 Unmarried : 3446 White :27816
## (Other) :10993 Wife : 1568
## NA's : 1843
## sex capital_gain capital_loss hours_per_week
## Female:10771 Min. : 0 Min. : 0.0 Min. : 1.00
## Male :21790 1st Qu.: 0 1st Qu.: 0.0 1st Qu.:40.00
## Median : 0 Median : 0.0 Median :40.00
## Mean : 1078 Mean : 87.3 Mean :40.44
## 3rd Qu.: 0 3rd Qu.: 0.0 3rd Qu.:45.00
## Max. :99999 Max. :4356.0 Max. :99.00
##
## native_country income
## United-States:29170 <=50K:24720
## Mexico : 643 >50K : 7841
## Philippines : 198
## Germany : 137
## Canada : 121
## (Other) : 1709
## NA's : 583
#This is where I started making changes after Rick’s contribution above
Notes: -I corrected white space issue before variable types were corrected -I added additional libraries at top
Remaining missing values visualized
#Proportion and total missing values
missing=data.frame(miss_var_cumsum(adult))
missing$prop_miss = missing$n_miss/dim(adult)[1]
missing[,-3] %>% arrange(-n_miss)
## variable n_miss prop_miss
## 1 occupation 1843 0.05660146
## 2 workclass 1836 0.05638647
## 3 native_country 583 0.01790486
## 4 age 0 0.00000000
## 5 fnlwgt 0 0.00000000
## 6 education 0 0.00000000
## 7 education_num 0 0.00000000
## 8 marital_status 0 0.00000000
## 9 relationship 0 0.00000000
## 10 race 0 0.00000000
## 11 sex 0 0.00000000
## 12 capital_gain 0 0.00000000
## 13 capital_loss 0 0.00000000
## 14 hours_per_week 0 0.00000000
## 15 income 0 0.00000000
#Chart of missing values by variable
gg_miss_var(adult)
Looking at categorical variable class balance
cat_vars=adult[,c("workclass","education","marital_status","occupation","relationship","race","sex","native_country","income")]
##Notes on Workclass
#Disproportionate NAs on <=50K
#Significant difference in self-emp-inc
#Small differences elsewhere
ggplot(cat_vars, aes(x= workclass, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="workclass") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
##Notes on Education
#Assoc-voc and Assoc-acdm only not sig diff
ggplot(cat_vars, aes(x= education, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="education") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
##Notes on Marital Status
#Very sig differences in Married-civ-spouse, never-married,and divorced
ggplot(cat_vars, aes(x= marital_status, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="marital_status") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= occupation, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="occupation") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= relationship, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="relationship") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= race, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="race") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= sex, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="sex") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= native_country, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="native_country") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
filt_cat_vars=filter(cat_vars, native_country != "United-States")
summary(filt_cat_vars$native_country)
## Cambodia Canada
## 19 121
## China Columbia
## 75 59
## Cuba Dominican-Republic
## 95 70
## Ecuador El-Salvador
## 28 106
## England France
## 90 29
## Germany Greece
## 137 29
## Guatemala Haiti
## 64 44
## Holand-Netherlands Honduras
## 1 13
## Hong Hungary
## 20 13
## India Iran
## 100 43
## Ireland Italy
## 24 73
## Jamaica Japan
## 81 62
## Laos Mexico
## 18 643
## Nicaragua Outlying-US(Guam-USVI-etc)
## 34 14
## Peru Philippines
## 31 198
## Poland Portugal
## 60 37
## Puerto-Rico Scotland
## 114 12
## South Taiwan
## 80 51
## Thailand Trinadad&Tobago
## 18 19
## United-States Vietnam
## 0 67
## Yugoslavia
## 16
ggplot(filt_cat_vars, aes(x= native_country, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="native_country") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(cat_vars, aes(x= income)) + geom_bar()
#75.919% of observations are <50K
dim(cat_vars[cat_vars$income=='<=50K',])[1]/
(dim(cat_vars[cat_vars$income=='>50K',])[1]+dim(cat_vars[cat_vars$income=='<=50K',])[1])
## [1] 0.7591904
Looking at Numerical Variables
num_vars=adult %>% select_if(is.numeric)
num_vars$income=adult$income
num_vars$income=factor(num_vars$income)
variable.names(num_vars)
## [1] "age" "fnlwgt" "education_num" "capital_gain"
## [5] "capital_loss" "hours_per_week" "income"
plot(num_vars[,1:6])
num_vars %>% ggplot(aes(x=income,y=age))+geom_boxplot()
#older people have higher income
num_vars %>% ggplot(aes(x=income,y=fnlwgt))+geom_boxplot()
#Education is significantly higher (we should think about if it will cause an issue to include both education variables)
num_vars %>% ggplot(aes(x=income,y=education_num))+geom_boxplot()
#Investigate probability if gains are more likely for higher income
num_vars %>% ggplot(aes(x=income,y=capital_gain))+geom_boxplot()
#Testing relation of gains and losses on income
test1=num_vars
test2=num_vars
test1$gains=ifelse(test1$capital_gain>0,'Gain','No Gain')
test2$losses=ifelse(test1$capital_loss>0,'Loss','No Loss')
test1$gains=factor(test1$gains)
test2$losses=factor(test2$losses)
ggplot(test1, aes(x= gains, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="gains") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
ggplot(test2, aes(x= losses, group=income)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..)), stat="count") +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= "count", vjust = -.5) +
labs(y = "Percent", fill="gains") +
facet_grid(~income) +
scale_y_continuous(labels = scales::percent)+coord_flip()+theme(legend.position = "none")
#Investigate probability if losses are more likely for lower income
num_vars %>% ggplot(aes(x=income,y=capital_loss))+geom_boxplot()
num_vars %>% ggplot(aes(x=income,y=hours_per_week))+geom_boxplot()
#Generate a correlation and significance table and plot for all vars
res2 <- rcorr(as.matrix(num_vars[,1:6]),type="pearson")
corrplot(res2$r, type="upper", order="hclust",
p.mat = res2$P, sig.level = 0.1, insig = "blank")
#Test variables with Wilcox Test for significance
vars_num=variable.names(num_vars[,-7])
p_matrix= matrix(nrow=length(num_vars)-1, ncol=3)
for (i in 1:length(vars_num)){
a=wilcox.test(num_vars[,(i)]~num_vars$income,alternative="two.sided")
p_matrix[i,1]=vars_num[i]
p_matrix[i,2]=a[[3]]
p_matrix[i,3]=ifelse(a[[3]]<=.05,"keep","remove")
}
p_matrix
## [,1] [,2] [,3]
## [1,] "age" "0" "keep"
## [2,] "fnlwgt" "0.0526819376884781" "remove"
## [3,] "education_num" "0" "keep"
## [4,] "capital_gain" "0" "keep"
## [5,] "capital_loss" "7.02129357933078e-143" "keep"
## [6,] "hours_per_week" "0" "keep"